---
title: 'Section 6: Empirical Synthesis'
author: "Milan Svolik"
date: "4/20/2023"
output:
  pdf_document: default
  html_document:
    df_print: paged
---

```{r setup, include=FALSE}
rm(list = ls(all = TRUE))
library(tidyverse)
library(stargazer)
library(lmtest)
library(reshape2)
library(foreign)
library(ggplot2)
library(broom)
library(estimatr)
library(texreg)


set.seed(5877)
```

# Load the survey data
```{R}
load("data/df_surveys.RData")
```


### Prepare the survey data for re-weighting
* I will only need vote_march and vote_june
* For re-weighting, 
  + use the most detailed information available and therefore include the THIRD (PARTY) category (i.e. differentiate between those who abstain and those who vote for a third party or candidate)
  + remove NAs and the June undecided (these cannot be used for re-weighting)
```{R}
df_surveys_synthesis <- df_surveys %>%
  select(rid, vote_march, vote_june, vote_march_3, vote_june_3) %>%
  mutate(vote_june = fct_recode(vote_june, NULL = "Undecided")) %>%
  filter(!is.na(vote_march) & !is.na(vote_june)) %>%
  mutate(vote_june = fct_drop(vote_june)) %>%
  mutate(vote_march_char = as.character(vote_march), 
         vote_june_char = as.character(vote_june))

```

### The survey data frame that will be re-weighted
```{R}
library(survey)
df_torake <- svydesign(ids=~1, data=df_surveys_synthesis)
```
# Load the election results
```{R}
load("data/df_2019istanbul_results.RData")
```

## Summarize how aggregating lower-level election outcomes up in an evenly divided electoratethe swamps the heterogeneity in outcome shifts

### The ballot box level
```{R}
df_results %>%
  mutate(
    REG_march = AKP_march + CHP_march + ABS_march + THIRD_march,
    AKP_share_march = AKP_march/REG_march, 
    CHP_share_march = CHP_march/REG_march, 
    ABS_share_march = ABS_march/REG_march, 
    THIRD_share_march = THIRD_march/REG_march,
    REG_june = AKP_june + CHP_june + ABS_june + THIRD_june,
    AKP_share_june = AKP_june/REG_june, 
    CHP_share_june = CHP_june/REG_june, 
    ABS_share_june = ABS_june/REG_june, 
    THIRD_share_june = THIRD_june/REG_june,
    AKP_share_diff = AKP_share_june - AKP_share_march, 
    CHP_share_diff = CHP_share_june - CHP_share_march, 
    ABS_share_diff = ABS_share_june - ABS_share_march, 
    THIRD_share_diff = THIRD_share_june - THIRD_share_march) %>%
  summarize(
    across(AKP_share_march:THIRD_share_diff, list(min=min, max=max))
  ) %>%
  pivot_longer(AKP_share_march_min:THIRD_share_diff_max, names_to = "statistic", values_to = "value") %>%
  mutate(across(value, round, 2)) %>%
  separate(col=statistic, into=c("outcome", "statistic"), sep = -3) %>%
  pivot_wider(names_from = statistic, values_from = value)  
```

### The neigborhood level
```{R}
df_results %>%
  group_by(neighborhood_id) %>%
  mutate(
    REG_march = AKP_march + CHP_march + ABS_march + THIRD_march,
    REG_june = AKP_june + CHP_june + ABS_june + THIRD_june) %>%
  summarize(
    across(AKP_march:REG_june, sum)
  ) %>%
  mutate(
    REG_march = AKP_march + CHP_march + ABS_march + THIRD_march,
    AKP_share_march = AKP_march/REG_march, 
    CHP_share_march = CHP_march/REG_march, 
    ABS_share_march = ABS_march/REG_march, 
    THIRD_share_march = THIRD_march/REG_march,
    REG_june = AKP_june + CHP_june + ABS_june + THIRD_june,
    AKP_share_june = AKP_june/REG_june, 
    CHP_share_june = CHP_june/REG_june, 
    ABS_share_june = ABS_june/REG_june, 
    THIRD_share_june = THIRD_june/REG_june,
    AKP_share_diff = AKP_share_june - AKP_share_march, 
    CHP_share_diff = CHP_share_june - CHP_share_march, 
    ABS_share_diff = ABS_share_june - ABS_share_march, 
    THIRD_share_diff = THIRD_share_june - THIRD_share_march) %>%
  summarize(
    across(AKP_share_march:THIRD_share_diff, list(min=min, max=max))
  ) %>%
  pivot_longer(AKP_share_march_min:THIRD_share_diff_max, names_to = "statistic", values_to = "value") %>%
  mutate(across(value, round, 2)) %>%
  separate(col=statistic, into=c("outcome", "statistic"), sep = -3) %>%
  pivot_wider(names_from = statistic, values_from = value)  
```
### The district level
```{R}
df_results %>%
  group_by(district_id) %>%
  mutate(
    REG_march = AKP_march + CHP_march + ABS_march + THIRD_march,
    REG_june = AKP_june + CHP_june + ABS_june + THIRD_june) %>%
  summarize(
    across(AKP_march:REG_june, sum)
  ) %>%
  mutate(
    REG_march = AKP_march + CHP_march + ABS_march + THIRD_march,
    AKP_share_march = AKP_march/REG_march, 
    CHP_share_march = CHP_march/REG_march, 
    ABS_share_march = ABS_march/REG_march, 
    THIRD_share_march = THIRD_march/REG_march,
    REG_june = AKP_june + CHP_june + ABS_june + THIRD_june,
    AKP_share_june = AKP_june/REG_june, 
    CHP_share_june = CHP_june/REG_june, 
    ABS_share_june = ABS_june/REG_june, 
    THIRD_share_june = THIRD_june/REG_june,
    AKP_share_diff = AKP_share_june - AKP_share_march, 
    CHP_share_diff = CHP_share_june - CHP_share_march, 
    ABS_share_diff = ABS_share_june - ABS_share_march, 
    THIRD_share_diff = THIRD_share_june - THIRD_share_march) %>%
  summarize(
    across(AKP_share_march:THIRD_share_diff, list(min=min, max=max))
  ) %>%
  pivot_longer(AKP_share_march_min:THIRD_share_diff_max, names_to = "statistic", values_to = "value") %>%
  mutate(across(value, round, 2)) %>%
  separate(col=statistic, into=c("outcome", "statistic"), sep = -3) %>%
  pivot_wider(names_from = statistic, values_from = value) 
```
### The city level
```{R}
df_results %>%
  mutate(city_id=1) %>%
  group_by(city_id) %>%
  mutate(
    REG_march = AKP_march + CHP_march + ABS_march + THIRD_march,
    REG_june = AKP_june + CHP_june + ABS_june + THIRD_june) %>%
  summarize(
    across(AKP_march:REG_june, sum)
  ) %>%
  mutate(
    REG_march = AKP_march + CHP_march + ABS_march + THIRD_march,
    AKP_share_march = AKP_march/REG_march, 
    CHP_share_march = CHP_march/REG_march, 
    ABS_share_march = ABS_march/REG_march, 
    THIRD_share_march = THIRD_march/REG_march,
    REG_june = AKP_june + CHP_june + ABS_june + THIRD_june,
    AKP_share_june = AKP_june/REG_june, 
    CHP_share_june = CHP_june/REG_june, 
    ABS_share_june = ABS_june/REG_june, 
    THIRD_share_june = THIRD_june/REG_june,
    AKP_share_diff = AKP_share_june - AKP_share_march, 
    CHP_share_diff = CHP_share_june - CHP_share_march, 
    ABS_share_diff = ABS_share_june - ABS_share_march, 
    THIRD_share_diff = THIRD_share_june - THIRD_share_march) %>%
  summarize(
    across(AKP_share_march:THIRD_share_diff, list(min=min, max=max))
  ) %>%
  pivot_longer(AKP_share_march_min:THIRD_share_diff_max, names_to = "statistic", values_to = "value") %>%
  mutate(across(value, round, 2)) %>%
  separate(col=statistic, into=c("outcome", "statistic"), sep = -3) %>%
  pivot_wider(names_from = statistic, values_from = value) 
```


# Post-stratify
* Post-stratification requires non-zero cell counts in the population. This is only an issue with THIRD (PARTY VOTES), which are zero in a number of boxes.
* Solve this by replacing 0 with .000001, but only for the purposes of post-stratification
```{R}
df_results_synthesis <- df_results %>%
  select(box_id, ends_with(c("march", "june"))) %>%
  mutate(THIRD_march = ifelse(THIRD_march==0, .000001, THIRD_march), 
         THIRD_june = ifelse(THIRD_june==0, .000001, THIRD_june))
```


### Go ballot box by ballot box
* this takes about 20hrs; to run remove "eval = FALSE"
* note that post-stratifying takes into account third-party voters, but the joint PMF (and mechanisms) only employs the outcomes AKP, CHP, ABS
* the bootstrap sample is the size of the ballot box, to calibrate uncertainty involved in having a sample of that size
* instead of saving the PMF, save the actual NUMBER of voters in the cross-tab: this will make it easy to aggregate up
```{R, eval = FALSE}
boot_rep <- 1000
n_boxes <- length(df_results_synthesis$box_id)

# SAVE THE MATRIX OF BOOTSTRAPPED Ps INTO THIS LIST, ONE ENTRY PER BALLOT BOX
box_list <- vector("list", n_boxes)

for (i in 1:n_boxes) {

  # Load ballot box level result
  box_id <- df_results_synthesis$box_id[i]
  AKP_march <- df_results_synthesis$AKP_march[i]
  CHP_march <- df_results_synthesis$CHP_march[i] 
  ABS_march <- df_results_synthesis$ABS_march[i]
  THIRD_march <- df_results_synthesis$THIRD_march[i]
  REG_march <- AKP_march + CHP_march + ABS_march + THIRD_march

  AKP_june <- df_results_synthesis$AKP_june[i]
  CHP_june <- df_results_synthesis$CHP_june[i] 
  ABS_june <- df_results_synthesis$ABS_june[i]
  THIRD_june <- df_results_synthesis$THIRD_june[i]

  # Construct the margins that will be used for post-stratifying
  march_freq <- data.frame(vote_march_char=c("AKP","CHP","ABS", "THIRD"), Freq=c(AKP_march, CHP_march, ABS_march, THIRD_march))
  june_freq <- data.frame(vote_june_char=c("AKP","CHP","ABS", "THIRD"), Freq=c(AKP_june, CHP_june, ABS_june, THIRD_june))
  df_raked <- rake(df_torake, list(~vote_march_char, ~vote_june_char), list(march_freq, june_freq), control = list(maxit = 1000, epsilon = 1e-7, verbose=FALSE))

  # Extract weights and use them as bootstrap probabilities
  df_toboot <- df_surveys_synthesis %>%
    mutate(rid = seq(1:nrow(df_surveys_synthesis))) %>%
    mutate(weight = weights(df_raked)) %>%
    select(rid, vote_march, vote_june, vote_march_3, vote_june_3, weight) 

  # BOOTSTRAP 
  p_boot_save <- c()
  for (j in 1:boot_rep) {
    # DRAW RIDS
    rid_boot <- sample(x=df_toboot$rid, size= REG_march, replace=TRUE, prob=df_toboot$weight)
    # EXTRACT THE DATA BASED ON THE DRAW
    df_boot <- df_toboot[rid_boot,]
    # COMPUTE THE CROSS-TAB
    p_boot <- table(df_boot$vote_march_3, df_boot$vote_june_3)
    #rowSums(p_boot)
    #colSums(p_boot)
    # TRANSFORM TO A VECTOR THAT IS A ROWWISE TRANSFORMATION OF THE MATRIX
    p_boot <- c(t(p_boot))
    # SAVE IT
    p_boot_save <- rbind(p_boot_save, p_boot)
  }
  
box_list[[i]] <- p_boot_save
}
save(box_list, file = "temp/box_list.RData")
```

# AGGREGATE THE JOINT PMF/CROSS-TAB TO THE CITY LEVEL
* Takes about 12hrs; to run remove "eval = FALSE"
* Go boot-draw by boot-draw: each entry in the list is a ballot box, each row in the ballot box df is one bootstrap draw
* p_boot_city: the city-level P matrix for each boot draw
```{R, eval = FALSE}
load("temp/box_list.RData")

# LOAD THE RESULTS DATA TO GET NUMBER OF BOXES
load("data/df_2019istanbul_results.RData")

boot_rep <- 1000
p_boot_city_list <- vector("list", boot_rep)
n_boxes <- length(df_results$box_id)

p_boot_city <- c()
for (j in 1:boot_rep) {
  #print(j)
  p_boot_extract <- c()
  for (i in 1:n_boxes) {
    # EXTRACT jTH BOOT DRAW FOR ALL BOXES
    p_boot_extract <- rbind(p_boot_extract, box_list[[i]][j,])
    # AGGREGATE jTH BOOT DRAW TO THE CITY LEVEL
    p_boot_city_j <-  apply(p_boot_extract, 2, sum)
  }
  p_boot_city_list[[j]] <- p_boot_city_j
} 

# TURN THE LIST INTO A DATA FRAME AND SAVE
save(p_boot_city_list, file = "temp/p_boot_city.RData")
```

# Compute the mechanisms at the CITY LEVEL
```{r}
load("temp/p_boot_city.RData")

boot_rep <- 1000
#boot_rep <- 10

P_boot <- c()
switching_boot <- c()
backlash_boot <- c()
disenchantment_boot <- c()
  
for (k in 1:boot_rep) {
  P_boot <- rbind(P_boot, p_boot_city_list[[k]])
  P <- matrix(p_boot_city_list[[k]], nrow=3, byrow=TRUE)
  
  switching <- P[1,2] - P[2,1]
  switching_boot <- rbind(switching_boot, switching)
  
  backlash <- P[3,2] - P[2,3]
  backlash_boot <- rbind(backlash_boot, backlash)
  
  disenchantment <- P[1,3] - P[3,1]
  disenchantment_boot <- rbind(disenchantment_boot, disenchantment)
}    

#### Summarize the transition matrix
P_boot_mean <- apply(P_boot, 2, mean, na.rm=TRUE)
P_boot_mean <- matrix(P_boot_mean, nrow=3, byrow=TRUE)
P_boot_lower <- apply(P_boot, 2, quantile, probs=c(.025), na.rm=TRUE)
P_boot_lower <- matrix(P_boot_lower, nrow=3, byrow=TRUE)
P_boot_upper <- apply(P_boot, 2, quantile, probs=c(.975), na.rm=TRUE)
P_boot_upper <- matrix(P_boot_upper, nrow=3, byrow=TRUE)

#### Summarize the mechanisms
switching_boot_mean <- apply(switching_boot, 2, mean, na.rm=TRUE)
switching_boot_lower <- apply(switching_boot, 2, quantile, probs=c(.025), na.rm=TRUE)
switching_boot_upper <- apply(switching_boot, 2, quantile, probs=c(.975), na.rm=TRUE)

backlash_boot_mean <- apply(backlash_boot, 2, mean, na.rm=TRUE)
backlash_boot_lower <- apply(backlash_boot, 2, quantile, probs=c(.025), na.rm=TRUE)
backlash_boot_upper <- apply(backlash_boot, 2, quantile, probs=c(.975), na.rm=TRUE)

disenchantment_boot_mean <- apply(disenchantment_boot, 2, mean, na.rm=TRUE)
disenchantment_boot_lower <- apply(disenchantment_boot, 2, quantile, probs=c(.025), na.rm=TRUE)
disenchantment_boot_upper <- apply(disenchantment_boot, 2, quantile, probs=c(.975), na.rm=TRUE)

# CONSOLIDATE
switching_boot_out <- c(switching_boot_mean, switching_boot_lower, switching_boot_upper)
backlash_boot_out <- c(backlash_boot_mean, backlash_boot_lower, backlash_boot_upper)
disenchantment_boot_out <- c(disenchantment_boot_mean, disenchantment_boot_lower, disenchantment_boot_upper)
```

# SUMMARIZE THE CITY-WIDE MECHS AND SHIFTS
```{R}
round(P_boot_mean)
round(P_boot_lower)
round(P_boot_upper)
round(P_boot_lower - P_boot_upper)

round(100*switching_boot_out/sum(P), 2)
round(100*backlash_boot_out/sum(P), 2)
round(100*disenchantment_boot_out/sum(P), 2)
```

## SUMMARIZE THE NUMBER OF VOTERS THAT PARTICIPATED IN EACH MECHANISM
```{R}
P_boot_mean
count_total <- sum(P_boot_mean)
count_total

# NO CHANGE IN THEIR ACTION
count_nochange <- P_boot_mean[1,1] + P_boot_mean[2,2] + P_boot_mean[3,3]
count_nochange
round(count_nochange/10^6, 1)
count_nochange/count_total

# CHANGE
count_total - count_nochange
round((count_total - count_nochange)/10^6, 1)
(count_total - count_nochange)/count_total

# SWITCHING
P_boot_mean[1,2]
P_boot_mean[2,1]
P_boot_mean[1,2] - P_boot_mean[2,1]

# BACKLASH
P_boot_mean[3,2]
P_boot_mean[2,3]
P_boot_mean[3,2] - P_boot_mean[2,3]

# DISENCHANTMENT
P_boot_mean[1,3]
P_boot_mean[3,1]
P_boot_mean[1,3] - P_boot_mean[3,1]
```

